ggplot2::ggplot() flyover
color vs. fill
:: operator to reference functions
inside packages.
select is a very common function name. Usually we need
to use dplyr::select to be explicit about which package's version we want.
```r
library(tidyverse)
head(diamonds)
#?filter
#filter from dplyr
diamonds %>%
dplyr::filter(color == "E") %>%
head()
<!-- rnb-source-end -->
<!-- rnb-frame-begin eyJtZXRhZGF0YSI6eyJjbGFzc2VzIjpbInRibF9kZiIsInRibCIsImRhdGEuZnJhbWUiXSwibnJvdyI6NiwibmNvbCI6MTAsInN1bW1hcnkiOnsiQSB0aWJibGUiOiI2IMOXIDEwIn19LCJyZGYiOiJINHNJQUFBQUFBQUFBNTFVUFd6VFFCUSsvOFFtbGtoU3RYTmhaRW1rTkFzTHlnMXQwa0JKVWdwUmh5N1grTkphT0had0hLQmxZZW5jR1ZZa3hzNGRFU09pRERBd2QyR0JsWVdoNFozdm51TjY3RWxQOTczbmU5KzllL2VkbjZ6dk5weGRoeEJpRUZQWGlGRUFTQXJQbnJhcTl3bEV3TkdJU1lwaWZnMkxsZ0dJMVNVd3Eza3grZkRUZWZDaStlV3lmYlp5ZkpuNjMwLzM3djM5Zk5yOEFSNkVtdC9PdDMrZnJKNUQvbTJSSjNZQU04SDB6R3prTnJSOC9wTDdVMENWSkVOR3pSYnpJc1R0TUhRVkxnNTRkSFEzRTdEN0VSOTdzN0Z5Q3gyWE16KzNSV0hvc3ludW9HTmlHTGs4NHNoampkZ3dEaU5BVjVrRDZCa1R2aTFOc090elJXaGo1OVlSYkNCb0lXZ2oyRVRRUWZCUTBobnptOVpuNUJwdDVlcTdwUmowVGwwaFk2ZXp0b0JwZExDenRvQVlOUWVMc01EMWxLMTFnOEtsb21qM2s5QWZvWTkvamNTZ1c5T0dHTFRYVVBOSEdlOGwweWpOZS9SVzVuWFYzS2VLaDF5ZnQrUjNHS3BSY0JQUzJtQmRzQjVZUCtVdHZ4UGpQUzJ2Q2duZm9SVlZSK1dyR0JkMEtSSDZIaTJmSkFzV2VWTDV0Q1NWVHl2SkE5bWtGZUZ0LzZGTHNuNWFGdUgyV1pwbnlCZEVkUkZkZVpQT2hZVE9vWlk2aHlINTgzSU8ySmlqbkoxVTR5eGlNZDdmY0JhbjhkQVA4U1haOEJBaUx6N0NieTZmeElmb3hHemY1K2hNSW0rSUR2d1NGTUJFN1RoWFVqRUtYOVd3TE5GelhkekFmRDcvbHhNSkpqZ3VpMWx0RkVHS1ZNWTFPanVjeEY0WUFKbStySlNlVGRZaUl2OVBtckxzeC9Jc0VJVzQxZUhoTEhoZXJXZVdvaUEweFluWWtkdWJXS1NGMHVYQmdSZGdHMnc0M1pRZHBDM3kyVDczbFZPQzh5ZkhyMEhqQXV5OUE5RnBMUTVqaHVzY3VBMk15QWR4OVI5L3VLam5uQVVBQUE9PSJ9 -->
<div data-pagedtable="false">
<script data-pagedtable-source type="application/json">
{"columns":[{"label":["carat"],"name":[1],"type":["dbl"],"align":["right"]},{"label":["cut"],"name":[2],"type":["ord"],"align":["right"]},{"label":["color"],"name":[3],"type":["ord"],"align":["right"]},{"label":["clarity"],"name":[4],"type":["ord"],"align":["right"]},{"label":["depth"],"name":[5],"type":["dbl"],"align":["right"]},{"label":["table"],"name":[6],"type":["dbl"],"align":["right"]},{"label":["price"],"name":[7],"type":["int"],"align":["right"]},{"label":["x"],"name":[8],"type":["dbl"],"align":["right"]},{"label":["y"],"name":[9],"type":["dbl"],"align":["right"]},{"label":["z"],"name":[10],"type":["dbl"],"align":["right"]}],"data":[{"1":"0.23","2":"Ideal","3":"E","4":"SI2","5":"61.5","6":"55","7":"326","8":"3.95","9":"3.98","10":"2.43"},{"1":"0.21","2":"Premium","3":"E","4":"SI1","5":"59.8","6":"61","7":"326","8":"3.89","9":"3.84","10":"2.31"},{"1":"0.23","2":"Good","3":"E","4":"VS1","5":"56.9","6":"65","7":"327","8":"4.05","9":"4.07","10":"2.31"},{"1":"0.29","2":"Premium","3":"I","4":"VS2","5":"62.4","6":"58","7":"334","8":"4.20","9":"4.23","10":"2.63"},{"1":"0.31","2":"Good","3":"J","4":"SI2","5":"63.3","6":"58","7":"335","8":"4.34","9":"4.35","10":"2.75"},{"1":"0.24","2":"Very Good","3":"J","4":"VVS2","5":"62.8","6":"57","7":"336","8":"3.94","9":"3.96","10":"2.48"}],"options":{"columns":{"min":{},"max":[10],"total":[10]},"rows":{"min":[10],"max":[10],"total":[6]},"pages":{}}}
</script>
</div>
<!-- rnb-frame-end -->
<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuYGBgclxuTkFcbk5BXG5OQVxuYGBgXG5gYGAifQ== -->
```r
```r
NA
NA
NA
<!-- rnb-source-end -->
<!-- rnb-frame-begin eyJtZXRhZGF0YSI6eyJjbGFzc2VzIjpbInRibF9kZiIsInRibCIsImRhdGEuZnJhbWUiXSwibnJvdyI6NiwibmNvbCI6MTAsInN1bW1hcnkiOnsiQSB0aWJibGUiOiI2IMOXIDEwIn19LCJyZGYiOiJINHNJQUFBQUFBQUFBNTFVTzJ3VFFSRGQrOWpHSjNGMkZCcWFRQmtLVzNMYzBFVGVJdGd4SDJNbllBV0padVBiSkNmT2QyWjlCbUlhbXRUVTBOS25kb25vUU5BaFVhZWhnUllLaXBqWjI1M3pCNnFzTk5vM2N6T3pzN052Ym1kcnIrcnNPWVFRaTlpbVFhd01RSko1K0tCZXVrbkFBb3BCYkpLWCt3dHdXZ1VndlYyUWJPM0w0TjAzWi9OcDdkTlo0L1RLK0N6VlA2Ly8rdkQ2OFkzYXh6ZHl2YTE5SFV1SGx4Qi9XY2JKRTBCc0VIV0F3dmJTZ2RtQVArUEJFRkF4aVZCV3U4NThnYmdSUlo3RytTNFh4OWZuRExtMjRIMS8xTmRxcHVseEZpd2RrZWtGYklnbm1CZ1lDWThMam5teUI2d1hSd0xRK2R3RnpQK0x6RzVPZGNJY2RtNEx3UzBFZFFRTkJOc0ltZ2h1cTNUVzlLTDFXZjgyZXFHK1N6cUQyYXhvWk8wMk4yWXd0WFozTjJZUXJYWjNacGE0a21hclg2Qnd4U2phZWkvNVIraTk3d2R5MGJ2RHFseTBYVmQ2NjJyQ0tOcFM5alR1emlzVjE5SjdteTdxOTRuT1MzRHBSc0ZMS0dtQWRFQWVLZEY1QzRyQnRMQTI2Znc0dVVhTFZWV1BLd20rL3B1NllGMmI3TkNWazhSaEZnZHpBTU5BM1VuaVFJdkpnR3hUZHpNWkVlcnFlbGFrdVhHYXhsbHFncWlwSmliZExabWw4NU9hYXJKb1J2a3QwemxrZlk1MGRsS09NOEZpZkwvZUtFN3RVUkRoSk9WZ0VJUWZIK00zancvaUkxUml0aDl3VkFiQzc2RUN2d1FOTU5BWUw1V1VGOUh6TXBZbGUyN0tGNWxPcDMrV1NJSUJqc2RpVmo0UUVLS1lzWkF1RncxaVB3b2htYm1xbVQ0ZmJBaWkvaytHbHZtUGhWRW9DL0ZLdmFOUitLUlVtWE5GUWhnNkoySkhIVzlqa1Zta0xnOFAvUkRia0lQYkRkbGgycUtBN2ZOQUt5N2NQN2wrR1JvWFl1OGRzQTdMY1JRejlIUGdOZENpQnVMOEwxTXlwNzZjQlFBQSJ9 -->
<div data-pagedtable="false">
<script data-pagedtable-source type="application/json">
{"columns":[{"label":["carat"],"name":[1],"type":["dbl"],"align":["right"]},{"label":["cut"],"name":[2],"type":["ord"],"align":["right"]},{"label":["color"],"name":[3],"type":["ord"],"align":["right"]},{"label":["clarity"],"name":[4],"type":["ord"],"align":["right"]},{"label":["depth"],"name":[5],"type":["dbl"],"align":["right"]},{"label":["table"],"name":[6],"type":["dbl"],"align":["right"]},{"label":["price"],"name":[7],"type":["int"],"align":["right"]},{"label":["x"],"name":[8],"type":["dbl"],"align":["right"]},{"label":["y"],"name":[9],"type":["dbl"],"align":["right"]},{"label":["z"],"name":[10],"type":["dbl"],"align":["right"]}],"data":[{"1":"0.23","2":"Ideal","3":"E","4":"SI2","5":"61.5","6":"55","7":"326","8":"3.95","9":"3.98","10":"2.43"},{"1":"0.21","2":"Premium","3":"E","4":"SI1","5":"59.8","6":"61","7":"326","8":"3.89","9":"3.84","10":"2.31"},{"1":"0.23","2":"Good","3":"E","4":"VS1","5":"56.9","6":"65","7":"327","8":"4.05","9":"4.07","10":"2.31"},{"1":"0.22","2":"Fair","3":"E","4":"VS2","5":"65.1","6":"61","7":"337","8":"3.87","9":"3.78","10":"2.49"},{"1":"0.20","2":"Premium","3":"E","4":"SI2","5":"60.2","6":"62","7":"345","8":"3.79","9":"3.75","10":"2.27"},{"1":"0.32","2":"Premium","3":"E","4":"I1","5":"60.9","6":"58","7":"345","8":"4.38","9":"4.42","10":"2.68"}],"options":{"columns":{"min":{},"max":[10],"total":[10]},"rows":{"min":[10],"max":[10],"total":[6]},"pages":{}}}
</script>
</div>
<!-- rnb-frame-end -->
<!-- rnb-chunk-end -->
<!-- rnb-text-begin -->
## Building Graphics
1. Draw by hand (or imagine) the specific plot that you intend to construct
2. Data Wrangling (if needed) to get the data in glyph-ready form, or verify that the current form is glyph-ready for your purposes.
3. Establish the frame using a `ggplot()` statement
4. Create the intended glyph using `geom_[style]()` such as
- `geom_point()`
- `geom_bar()`
- `geom_boxplot()`
- `geom_density()`
- `geom_vline()`
- `geom_segment()`
- `geom_histogram()`
- and *many* more
5. Map variables to the graphical attributes of the glyph using: `aes( )`
- Rule of thumb: any time you are plotting with ggplot, ALL variables need to be inside an `aes` (except facets, later in slides), and all constants go outside of the `aes`.
- e.g. `geom_point(aes(color = gender))` vs. `geom_point(color = "red")`
6. Add additional layers to the frame using the `+` symbol
- Note: **not** `%>%` between layers of `ggplot2` graphics
- Think of `+` as the equivalent of "add layer on top of ..." in `ggplot2` portions, whereas `%>%` is "and then the next step is..."
*Steps 4 and 5 can be switched.*

## Example: Baby Names
Let's look at our `BabyNames` data set again.
<!-- rnb-text-end -->
<!-- rnb-chunk-begin -->
<!-- rnb-source-begin eyJkYXRhIjoiYGBgclxuIyBpbnNwZWN0IGRhdGEgaW50YWtlXG5nbGltcHNlKEJhYnlOYW1lcylcblxuYGBgIn0= -->
```r
# inspect data intake
glimpse(BabyNames)
Rows: 1,792,091
Columns: 4
$ name <chr> "Mary", "Anna", "Emma", "Elizabeth", "Minnie", "Margaret", "Ida", "…
$ sex <chr> "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F…
$ count <int> 7065, 2604, 2003, 1939, 1746, 1578, 1472, 1414, 1320, 1288, 1258, 1…
$ year <int> 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1880, 1…
The graph looks perfectly fine, but this code isn’t easy to read.
This is why we stress writing readable code!
Nothing is here! That is exactly what is supposed to happen. Calling
ggplot() only tells R that we are ready to plot and that we
want to create some space to make the plot.
ggplot(data = Names)
NA
Still nothing! We need to tell it what our axes are.
Note that ggplot uses +, NOT %>%. This
is because we are adding layers to our plots.
ggplot(data = Names) +
geom_line()
Error in `geom_line()`:
! Problem while setting up geom.
ℹ Error occurred in the 1st layer.
Caused by error in `compute_geom_1()`:
! `geom_line()` requires the following missing aesthetics: x and y
Backtrace:
1. base (local) `<fn>`(x)
2. ggplot2:::print.ggplot(x)
4. ggplot2:::ggplot_build.ggplot(x)
5. ggplot2:::by_layer(...)
12. ggplot2 (local) f(l = layers[[i]], d = data[[i]])
13. l$compute_geom_1(d)
14. ggplot2 (local) compute_geom_1(..., self = self)
Note - this is why I like to map aesthetics first, so we can avoid errors.
Rule of thumb: anytime when you are plotting with ggplot, ALL
variables need to be inside an aes (except facets, later in
slides), and all constants go outside of the aes.
#not Quite
ggplot(data = Names) +
geom_line( aes(x = year, y = total))
#add groups
ggplot(data = Names) +
geom_line( aes(x = year, y = total, group = name))
#add color
ggplot(data = Names) +
geom_line( aes(x = year, y = total, color = name))
NA
NA
ggplot(data = Names) +
geom_line( aes(x = year, y = total, color = name)) +
ggtitle("Names Over Time") +
xlab("Year") +
ylab("Popularity") +
guides(color = guide_legend(title = "Siblings Names" ))
NA
NA
ggplot(data = Names) +
geom_line( aes(x = year, y = total, color = name, linetype = name)) +
ggtitle("Names Over Time") +
xlim(c(1972, 2022))+
xlab("Year") +
ylab("Popularity") +
guides(color = guide_legend(title = "Siblings Names" ),
linetype = guide_legend(title = "Still Siblings Names" ))
NA
NA
facet_wrap()
Facets require a formula syntax we haven’t seen much yet. There are two main ways to plot with facets. Here are a few pointers:
facet_wrap() just makes a separate plot for each level
of the categorical variable
facet_wrap( ~ categoricalVariable)
facet_grid() allows control of row & column
facets
facet_grid() syntax:
facet_grid(rows ~ cols)
facet_grid( rows ~ . ) (note the
required “.”)
facet_grid( ~ cols) (no
“.” this time)
color and fill
library(mosaicData)
head(CPS85)
CPS85 %>%
ggplot() +
geom_density(aes(x = wage, color = sex), alpha = 0.4)+
facet_grid( ~ married) +
xlim(0,30)
CPS85 %>%
ggplot() +
geom_density(aes(x = wage, fill = sex), alpha = 0.4)+
facet_grid( ~ married) +
xlim(0,30)
CPS85 %>%
ggplot() +
geom_density(aes(x = wage, fill = sex, color = sex), alpha = 0.4)+
facet_grid( ~ married) +
xlim(0,30)
CPS85%>%
ggplot(aes(x = married, color = sex)) +
geom_bar() +
facet_wrap( ~ union, scales = "free") #Note the scales here
CPS85%>%
ggplot(aes(x = married, fill = sex)) +
geom_bar()+
facet_wrap( ~ union, scales = "free") #Note the scales here
CPS85%>%
ggplot(aes(x = age, y = wage, color = sex)) +
geom_point()
CPS85%>%
ggplot(aes(x = age, y = wage, fill = sex)) + #fill does not work for points!
geom_point()
NA
NA
establish the frame
plot the glyphs (i.e., select a geom)
map the aesthetics
add labels and title
other features (e.g., alpha, sizing, etc)
Establish the Frame
ggplot(data = diamonds)
ggplot(data = diamonds) +
geom_point()
Error in `geom_point()`:
! Problem while setting up geom.
ℹ Error occurred in the 1st layer.
Caused by error in `compute_geom_1()`:
! `geom_point()` requires the following missing aesthetics: x and y
Backtrace:
1. base (local) `<fn>`(x)
2. ggplot2:::print.ggplot(x)
4. ggplot2:::ggplot_build.ggplot(x)
5. ggplot2:::by_layer(...)
12. ggplot2 (local) f(l = layers[[i]], d = data[[i]])
13. l$compute_geom_1(d)
14. ggplot2 (local) compute_geom_1(..., self = self)
ggplot(data = diamonds, aes(x = carat, y = price)) +
geom_point()
ggplot(data = diamonds, aes(x = carat, y = price)) +
geom_point(aes(color = depth), alpha = 0.5, size = 1) +
ggtitle("Diamonds Data") +
xlab("Carat") +
ylab("Price")
Notice that I can have aes inside multiple statements.
Notice that when I use constants (like
alpha = 0.3, size = 0.1) they ARE NOT inside
aes.
In general, variables go inside aes and constants go
outside of it. (unless we are using facets then see previous
materials.)
ggplot(data = diamonds, aes(x = carat, y = price)) +
geom_point(aes(colour = depth), alpha = 0.3, size = 0.1) +
ggtitle("Diamonds Data") +
xlab("Carat") +
ylab("Price") +
facet_grid( cut ~ color)
ggplot(data = diamonds, aes(x = carat, y = price)) +
geom_point(colour = "red", alpha = 0.3, size = 0.1) +
ggtitle("Diamonds Data") +
xlab("Carat") +
ylab("Price") +
facet_grid( cut ~ color)
aes
aes can go inside the ggplot()
function, inside the geom_[chart]() function itself, or
both. The following 3 options create the same plot, but the code is
slightly different.
#option 1
ggplot(data = diamonds, ) +
geom_point(aes(x = carat, y = price, color = clarity),
alpha = 0.2,
size = 1) +
geom_smooth(method = "glm" ,
formula = y ~ poly(x, 2), # y = b_0 + b_1 x + b_2 x^2 + e
aes(x = carat, y = price),
color = "red") +
ylim(c(0, 20000))
#option 2
ggplot(data = diamonds, aes(x = carat, y = price, color = clarity)) +
geom_point(alpha = 0.2,
size = 1) +
geom_smooth(method = "glm" ,
formula = y ~ poly(x, 2), # y = b_0 + b_1 x + b_2 x^2 + e
aes(x = carat, y = price),
color = "red") +
ylim(c(0, 20000))
#Option 3
ggplot(data = diamonds, aes(x = carat, y = price) )+
geom_point( aes(color = clarity),
alpha = 0.2,
size = 1) +
geom_smooth(method = "glm" ,
formula = y ~ poly(x, 2), # y = b_0 + b_1 x + b_2 x^2 + e
color = "red") +
ylim(c(0, 20000))
I personally prefer to put “global” aesthetics in the
ggplot() and “local” aesthetics in the
geom.
- x and y
- color = clarity is not needed for
geom_smooth
- geom_point and geom_smooth both use
x and y, so I put them in the
ggplot()
- geom_point uses color = clarity, so I
put that ONLY in the geom_point function

In my opinion, Option 3 is the “cleanest” code. This is partly based on stylistic preference and partly based on some internal mechanics of ggplot (which are beyond the scope of this course). How you write your code is up to you. Just keep it readable!
But again, all 3 options generate the exact same plot (so does it really matter that much which option we use??)